Code
library(sf)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)
library(rnaturalearth)
library(viridis)
library(scales)
library(RColorBrewer)
library(units)
library(knitr)
library(magick)
sf_use_s2(TRUE)
This supplementary document provides the R code, statistics, figures, and maps used in the analyses for the paper
“Glottography: an open-source geolinguistic data platform for mapping the world’s languages.”
All scripts and functions used in data cleaning, visualisation, and statistical modeling are included below.
To reproduce the results, please ensure that all dependencies listed in this section are installed. Note that the Ethnologue data used for comparison are subject to third-party licensing restrictions and cannot be publicly shared.
library(sf)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)
library(rnaturalearth)
library(viridis)
library(scales)
library(RColorBrewer)
library(units)
library(knitr)
library(magick)
sf_use_s2(TRUE)
# Custom functions for analysing Glottography data
#' Get all public repositories of an organisation on Github
#'
#' This function fetches all public repositories for a given GitHub organisation,
#' handling pagination automatically.
#'
#' @param org Character; the GitHub organisation name.
#' @return A character vector of repository names (meta/tooling repositories
#'   such as "tutorials" are excluded).
#'
get_public_repos <- function(org) {
  repos <- gh::gh("/orgs/{org}/repos", org = org, type = "public", .limit = Inf)
  # Extract repository names; vapply guarantees a character vector even when
  # the organisation has no repositories (sapply would return an empty list).
  repo_names <- vapply(repos, function(x) x$name, character(1))
  # Repositories that do not contain Glottography data sets
  non_data_repos <- c("tutorials", ".github", "pyglottography")
  return(setdiff(repo_names, non_data_repos))
}
#' Load Glottography data sets from a set of handles
#'
#' Downloads (or reads from the local cache) the feature, language, and family
#' GeoJSON files for every requested Glottography handle and returns them as
#' combined `sf` objects. URLs are built from the fixed repository layout at
#' `base_url`; files are fetched only when missing locally or when
#' `force_reload = TRUE`.
#'
#' The `"asher2007world"` handle is special-cased downstream: it has two
#' sub-folders (`contemporary` and `traditional`) that yield two separate
#' data sets (e.g., `"asher2007world_contemporary"`).
#'
#' @param glottography_handles Character vector of data set handles to load.
#' @param force_reload Logical; if `TRUE`, re-download files even when cached
#'   copies exist. Defaults to `FALSE`.
#'
#' @return A named list with elements `features`, `languages`, and `families`;
#'   each is an `sf` object spanning all handles, with a `"dataset"` column
#'   recording the origin of every row.
#'
load_glottography <- function(glottography_handles, force_reload = FALSE) {
  base_url <- "https://raw.githubusercontent.com/Glottography/"
  folder_path <- "refs/heads/main/cldf"
  level_names <- c("features", "languages", "families")
  # Read every handle once ...
  per_handle <- map(
    glottography_handles,
    \(h) load_handle(h, level_names, base_url, folder_path, force_reload)
  )
  # ... then stack the per-handle results level by level
  list(
    features = map_dfr(per_handle, "features"),
    languages = map_dfr(per_handle, "languages"),
    families = map_dfr(per_handle, "families")
  )
}
#' Get folder names for a given Glottography handle
#'
#' Determines which repository folder(s) hold the data for a handle.
#' Most handles map to a single folder; `"asher2007world"` is special-cased
#' because it ships two sub data sets (`contemporary` and `traditional`).
#'
#' @param handle Character; the Glottography dataset handle.
#' @param folder_path Character; the base path to the dataset folders.
#'
#' @return A data frame with two columns:
#'   - `folder`: the path(s) to the folder(s) containing the data set(s).
#'   - `dataset_name`: the corresponding name(s) for each data set.
#'
get_folders_per_handle <- function(handle, folder_path) {
  if (handle == "asher2007world") {
    # Two sub data sets live in separate sub-folders of this repository
    variants <- c("contemporary", "traditional")
    data.frame(
      folder = file.path(folder_path, variants),
      dataset_name = paste("asher2007world", variants, sep = "_"),
      stringsAsFactors = FALSE
    )
  } else {
    # Regular handles: one folder, data set named after the handle
    data.frame(
      folder = folder_path,
      dataset_name = handle,
      stringsAsFactors = FALSE
    )
  }
}
#' Load a single Glottography data level for a handle
#'
#' Downloads or reads every GeoJSON file belonging to one data level
#' (`features`, `languages`, or `families`) of a Glottography handle.
#' Handles with multiple folders (e.g., `"asher2007world"`) produce one
#' data set per folder; each is tagged with a `dataset` column before
#' all are combined.
#'
#' @param handle Character; the Glottography data set handle.
#' @param level Character; one of `"features"`, `"languages"`, or `"families"`.
#' @param base_url Character; base URL to the Glottography repository.
#' @param folders Data frame with columns `folder` and `dataset_name`, as
#'   returned by `get_folders_per_handle()`.
#' @param force_reload Logical; if `TRUE`, re-download even if local files exist.
#'
#' @return An `sf` object combining all data sets for this handle and level,
#'   with an additional `dataset` column indicating the source of each row.
#'
load_level <- function(handle, level, base_url, folders, force_reload) {
  level_file <- paste0(level, ".geojson")
  # One tagged sf object per folder / dataset-name pair
  tagged <- purrr::map2(
    folders$folder, folders$dataset_name,
    function(folder, dataset_name) {
      # Local cache location for this data set
      local_dir <- file.path("data", "glottography", dataset_name)
      local_file <- file.path(local_dir, level_file)
      remote_url <- file.path(base_url, handle, folder, level_file)
      ds <- read_or_write_dataset(remote_url, local_file, local_dir,
                                  level, force_reload)
      # Record which data set each row came from
      ds$dataset <- dataset_name
      ds
    }
  )
  bind_rows(tagged)
}
#' Load all levels for a single Glottography handle
#'
#' Loads every requested data level (`features`, `languages`, `families`)
#' for one Glottography handle. Folder resolution (including the
#' `"asher2007world"` special case) is delegated to
#' `get_folders_per_handle()`; the actual reading to `load_level()`.
#'
#' @param handle Character; the Glottography data set handle.
#' @param level_names Character vector; levels to load, e.g.,
#'   `c("features", "languages", "families")`.
#' @param base_url Character; base URL to the Glottography repository.
#' @param folder_path Character; base path to data set folders in the repository.
#' @param force_reload Logical; if `TRUE`, re-download even if local files exist.
#'
#' @return A named list, one element per level; each element is an `sf`
#'   object combining all data sets for that level, with a `dataset`
#'   column indicating the source of each row.
#'
load_handle <- function(handle, level_names, base_url, folder_path, force_reload) {
  # Resolve the folder(s) and data set name(s) for this handle
  folders <- get_folders_per_handle(handle, folder_path)
  # Load each level and keep the level names on the result
  setNames(
    lapply(level_names, function(lvl) {
      load_level(handle, lvl, base_url, folders, force_reload)
    }),
    level_names
  )
}
#' Download or read a Glottography data set
#'
#' Downloads a GeoJSON file from a given URL if it does not exist locally
#' (or if `force_reload = TRUE`) and then reads it into R as an `sf` object.
#' Z/M dimensions are dropped and any `id` column is coerced to character so
#' that data sets from different sources can be row-bound.
#'
#' @param url Character; the URL of the GeoJSON file to download.
#' @param cldf_file Character; local file path where the dataset should be saved.
#' @param path Character; directory path in which to save the file.
#' @param level Character; dataset level (`features`, `languages`, or `families`) — used for reference.
#' @param force_reload Logical; if `TRUE`, forces re-downloading the file even if it exists locally.
#'
#' @return An `sf` object containing the dataset.
#'
read_or_write_dataset <- function(url, cldf_file, path, level, force_reload) {
  # Create local directory if it doesn't exist
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  # Download the file if missing or if forced
  if (!file.exists(cldf_file) || isTRUE(force_reload)) {
    # If the download fails part-way, delete the partial file so a later
    # call does not silently read a truncated/corrupt GeoJSON from cache.
    tryCatch(
      download.file(url, cldf_file, mode = "wb"),
      error = function(e) {
        unlink(cldf_file)
        stop("Failed to download ", url, ": ", conditionMessage(e),
             call. = FALSE)
      }
    )
  }
  # Read the data set into R as an sf object
  sf::st_read(cldf_file, quiet = TRUE) |>
    sf::st_zm(drop = TRUE, what = "ZM") |>
    mutate(across(any_of("id"), as.character))
}
#' Load Glottolog language data from the official URL
#'
#' Downloads the Glottolog languoid CSV zip file (if not already present
#' under `data/glottolog`), extracts it, and loads it as a `data.frame`.
#'
#' @return A `data.frame` of Glottolog languoids.
load_glottolog <- function() {
  # `<-` for assignment (was `=`); URL of the official Glottolog export
  url <- "https://cdstar.eva.mpg.de//bitstreams/EAEA0-2198-D710-AA36-0/glottolog_languoid.csv.zip"
  path <- file.path("data", "glottolog")
  zip_file <- file.path(path, "glottolog.zip")
  csv_file <- file.path(path, "languoid.csv")
  # Create folder if it does not exist
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  # Download if zip does not exist
  if (!file.exists(zip_file)) {
    download.file(url, zip_file, mode = "wb")
  }
  # Unzip if CSV does not exist
  if (!file.exists(csv_file)) {
    unzip(zip_file, exdir = dirname(zip_file))
  }
  # Return the CSV directly (the previous trailing assignment returned the
  # value invisibly, which hid it in interactive use)
  read.csv(csv_file)
}
#' Load Natural Earth polygons (land or ocean)
#'
#' This function loads Natural Earth geometries for land, ocean,
#' populated places, countries, rivers or lakes as an `sf` object.
#' It first checks for a cached GeoPackage under `data/natural_earth`
#' (e.g., `ne_land.gpkg` or `ne_ocean.gpkg`). If the file exists, it is read
#' directly. Otherwise, the data is downloaded from Natural Earth using
#' `rnaturalearth`, stored as an `sf` object, and saved locally for future reuse.
#'
#' @param type Character; one of `"land"`, `"ocean"`, `"populated_places"`,
#'   `"countries"`, `"rivers_lake_centerlines"`, `"lakes"` specifying which
#'   data to load.
#'
#' @return An `sf` object representing the requested Natural Earth geometries
#'
load_ne <- function(type = c("land", "ocean", "populated_places",
                             "countries", "rivers_lake_centerlines", "lakes")) {
  type <- match.arg(type)
  # Natural Earth groups its layers into "physical" and "cultural" themes.
  # match.arg() guarantees `type` is one of the six values above, so a plain
  # else covers the cultural layers.
  if (type %in% c("land", "ocean", "rivers_lake_centerlines", "lakes")) {
    category <- "physical"
  } else {
    category <- "cultural"
  }
  path <- file.path("data", "natural_earth")
  ne_file <- file.path(path, paste0("ne_", type, ".gpkg"))
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  if (file.exists(ne_file)) {
    # Read cached file
    ne_sf <- sf::st_read(ne_file, quiet = TRUE)
  } else {
    # Download from Natural Earth
    ne_sf <- rnaturalearth::ne_download(
      scale = "large",
      type = type,
      category = category,
      returnclass = "sf"
    )
    # Save as GeoPackage for future reuse
    sf::st_write(ne_sf, ne_file,
                 layer_options = "GEOMETRY_NAME=geometry",
                 quiet = TRUE, delete_layer = TRUE)
  }
  ne_sf
}
#' Create a land mask for cutting the grid
#'
#' Identifies all language polygons that do not intersect land, buffers
#' them slightly, merges them with the land layer, and returns a single
#' unified mask polygon. The mask can then be used to crop ("cookie-cut")
#' grid cells for visualisation or analysis.
#'
#' @param land_polygon sf object in EPSG:3857. Polygons representing land areas (e.g., continents).
#' @param languages sf object in EPSG:3857. Polygons representing language distributions.
#'
#' @return An `sf` object of class MULTIPOLYGON representing the combined mask in EPSG:3857:
#'   land polygons from `land_polygon` plus buffered language polygons not
#'   intersecting land.
#'
#' @details
#' - The buffer applied to language polygons is 100 units (assumes same CRS as `languages`).
#' - All geometries are cast to MULTIPOLYGON and unioned into a single mask.
#'
create_land_mask <- function(land_polygon, languages){
  # For each language polygon, which land polygons does it touch?
  hits <- st_intersects(languages, land_polygon)
  # Languages with no land intersection (e.g. areas missing from the land
  # layer): buffer them slightly so they survive the grid clipping
  off_land <- languages |>
    filter(lengths(hits) == 0) |>
    mutate(geometry = st_buffer(geometry, 100)) |>
    select(geometry) |>
    st_cast("MULTIPOLYGON")
  # Merge buffered off-land language areas with the land layer into one mask
  off_land |>
    bind_rows(land_polygon) |>
    st_union()
}
#' Create a global equal area hexagonal grid clipped to a mask
#'
#' Generates a global equal area hexagonal grid at the specified resolution,
#' optionally clips the grid cells to those intersecting a mask.
#' The grid is in Equal Earth (world) projection (EPSG 8857).
#'
#' @param resolution Numeric. Cell size of the hexagons, expressed in meters
#' @param mask sf object in EPSG 8857 or `NULL`. An optional mask geometry (e.g., continents).
#'   If provided, only grid cells intersecting the mask are retained.
#'   If `NULL` (the default), the full grid covering the bounding box is returned.
#'
#' @return An `sf` object containing the hexagonal grid cells that intersect
#'   the mask (if provided), with a unique `id` column.
#'
create_world_grid_eq_area <- function(resolution, mask = NULL) {
  # BUG FIX: the default used to be `mask = NA`, which is not NULL, so calling
  # the function without a mask passed NA into st_intersects() and failed.
  # NULL is the correct "no mask" sentinel for the is.null() guard below.
  offset <- 0.1
  bbox <- st_bbox(c(xmin = -180 + offset, ymin = -90,
                    xmax = 180 - offset, ymax = 90), crs = 4326) |>
    st_as_sfc() |>
    st_transform(3857) |>
    # Densify the outline so the reprojected box follows the curved graticule
    st_segmentize(units::set_units(100000, "m")) |>
    st_transform(8857)
  # Create hexagonal grid
  grid <- st_make_grid(
    bbox,
    what = "polygons",
    cellsize = resolution,
    square = FALSE,
    flat_topped = FALSE
  ) |> st_as_sf() |> st_set_geometry("geom")
  # Keep only cells touching the mask, if one was supplied
  if (!is.null(mask)) {
    intersections <- st_intersects(grid, mask)
    grid <- grid |> filter(lengths(intersections) > 0)
  }
  grid <- grid |>
    mutate(id = row_number())
  return(grid)
}
#' Create an ocean mask for visualising grid cells in Equal Earth projection
#'
#' Builds a "negative space" mask: a frame around the full map extent
#' (EPSG:8857) combined with the ocean polygon. Plotting this mask on top of
#' a grid visually hides cells that fall in the ocean or out of bounds.
#'
#' @param ocean_polygon `sf` object. A polygon geometry representing oceans,
#'   provided in EPSG:3857 (Web Mercator).
#' @param buffer Numeric. Buffer distance (in meters) to expand the bounding box
#'   after reprojection. Ensures that the mask fully covers the plotting extent.
#'
#' @return An `sf` polygon geometry representing the mask (land
#'   with ocean cut out).
create_ocean_mask_eq_area <- function(ocean_polygon, buffer) {
  eps <- 0.01
  # Full world extent, slightly widened in longitude, in Equal Earth
  world <- st_bbox(c(xmin = -180 - eps, ymin = -90,
                     xmax = 180 + eps, ymax = 90), crs = 4326) |>
    st_as_sfc() |>
    st_transform(3857) |>
    st_segmentize(units::set_units(100000, "m")) |>
    st_transform(8857)
  # Ring around the world extent that covers anything drawn out of bounds
  frame <- st_difference(st_buffer(world, buffer), world)
  # Combine the frame with the reprojected ocean polygon into one mask
  st_union(frame, st_transform(ocean_polygon, 8857))
}
#' Create a bounding box polygon cutting off the poles in Equal Earth projection (EPSG 8857).
#'
#' Builds a lon/lat rectangle between the given latitude limits (EPSG:4326),
#' densifies its outline, and reprojects it to Equal Earth (EPSG:8857).
#'
#' @param north Numeric. The northern latitude limit in degrees.
#' @param south Numeric. The southern latitude limit in degrees.
#'
#' @return An sf object, the bounding box polygon excluding the poles
#'
get_no_pole_bbox <- function(north, south) {
  # Closed ring of the rectangle corners (first point repeated at the end)
  corners <- rbind(
    c(-180, south),
    c(-180, north),
    c(180, north),
    c(180, south),
    c(-180, south)
  )
  st_sfc(st_polygon(list(corners)), crs = 4326) |>
    st_transform(3857) |>
    # Densify so the edges follow the curved graticule after reprojection
    st_segmentize(units::set_units(100000, "m")) |>
    st_transform(8857)
}
#' Define an expanded map extent around polygons
#'
#' Computes the bounding box of an `sf` object and pads it on every side by
#' a fraction of its own width/height. The padded box is returned as an
#' `sfc` object in the same CRS as the input.
#'
#' @param polygons An `sf` object (polygons) for which the extent is defined.
#' @param x_offset Numeric. Fraction of the x-extent by which to expand
#'   the bounding box on both sides (e.g., 0.1 = 10%).
#' @param y_offset Numeric. Fraction of the y-extent by which to expand
#'   the bounding box on both sides.
#'
#' @return An `sfc` object representing the expanded bounding box
#'   with the same CRS as `polygons`.
#'
define_map_extent <- function(polygons, x_offset, y_offset){
  bb <- st_bbox(polygons)
  # Padding is proportional to the extent in each direction
  pad_x <- (bb[["xmax"]] - bb[["xmin"]]) * x_offset
  pad_y <- (bb[["ymax"]] - bb[["ymin"]]) * y_offset
  expanded <- c(
    xmin = bb[["xmin"]] - pad_x,
    xmax = bb[["xmax"]] + pad_x,
    ymin = bb[["ymin"]] - pad_y,
    ymax = bb[["ymax"]] + pad_y
  )
  st_as_sfc(st_bbox(expanded, crs = st_crs(polygons)))
}
#' Build a Language Map
#'
#' Creates a thematic map visualising the supplied language polygons.
#' The map integrates multiple spatial layers (land, ocean, countries,
#' lakes, and cities) and applies user-defined parameters for extent, labeling, and colors.
#'
#' Relies on the globals `ne_land`, `ne_ocean`, `ne_country`, `ne_lake`,
#' and `ne_city` being loaded beforehand (see the data-loading section).
#'
#' @param poly An `sf` object containing polygons representing regions or language areas.
#' @param ds Character string. Dataset label or name to be used as the legend title.
#' @param map_parameters A list of map customization parameters, including:
#'   - `remove_country_label`: Vector of country names to exclude from labeling.
#'   - `color`: Vector of fill colors corresponding to `poly$name`.
#'   - `nudge_x_country_name`, `nudge_y_country_name`: Numeric offsets for country name labels.
#'   - `city_scale_rank`: Numeric threshold controlling which cities are displayed.
#' @param map_extent An `sfc` object defining the spatial extent (bounding box) for cropping map layers.
#'
#' @return A `ggplot` object representing the assembled map with customized styling,
#'   suitable for further annotation or saving as an image.
#'
build_language_map <- function(poly, ds, map_parameters, map_extent) {
  # Cities inside the extent, thinned by scale rank.
  # BUG FIX: previously read `params$city_scale_rank` from the calling
  # environment although the threshold is documented as part of
  # `map_parameters`; use the function argument instead.
  cities_crop <- ne_city |>
    st_filter(map_extent) |>
    filter(SCALERANK < map_parameters$city_scale_rank)
  ne_country_labels <- ne_country |>
    filter(!NAME %in% map_parameters$remove_country_label)
  ggplot() +
    geom_sf(data = ne_land, fill = "white") +
    geom_sf(data = ne_ocean, fill = "#D0E1F2", color = NA) +
    geom_sf(data = ne_country, fill = "white", color = "grey", linewidth = 0.5) +
    geom_sf(data = ne_lake, fill = "#7FB4D6", color = NA) +
    geom_sf(data = poly, aes(fill = name), color = NA, alpha = 0.5) +
    geom_sf(data = cities_crop) +
    geom_sf_text(
      data = ne_country_labels, aes(label = toupper(NAME)),
      size = 9, check_overlap = TRUE,
      nudge_x = map_parameters$nudge_x_country_name,
      nudge_y = map_parameters$nudge_y_country_name,
      fontface = "bold", color = "grey"
    ) +
    geom_sf_text(
      # cities_crop is already filtered by scale rank above; the previous
      # second filter on the same condition was redundant
      data = cities_crop,
      aes(label = NAME),
      size = 8,
      nudge_y = 48000,
      nudge_x = 20000
    ) +
    scale_fill_manual(
      values = setNames(map_parameters$color, poly$name),
      labels = NULL,
      name = paste0(ds)
    ) +
    coord_sf(
      xlim = st_bbox(map_extent)[c("xmin", "xmax")],
      ylim = st_bbox(map_extent)[c("ymin", "ymax")]
    ) +
    guides(
      fill = guide_legend(
        override.aes = list(fill = NA) # hides the boxes/symbols
      )) +
    theme_void() +
    theme(
      panel.background = element_rect(fill = "white", color = NA),
      legend.position = c(1., 1.),
      legend.margin = margin(t = 10, r = 10, b = 10, l = 10),
      legend.text = element_text(size = 1, hjust = 1),
      legend.title = element_text(size = 25, face = "bold", hjust = 1),
      legend.justification = c("right", "top"),
      legend.background = element_rect(fill = "white", color = NA),
      legend.key = element_rect(fill = "white", color = NA),
      plot.background = element_rect(fill = "white")
    )
}
#' Build a Summary Sheet for a Polygon
#'
#' Generates a visual summary (information sheet) for a given polygon,
#' displaying its name, glottocode, and data sources in a clean,
#' publication-style layout. The output is a simple `ggplot` graphic
#' with customisable color and spacing.
#'
#' @param poly An `sf` object containing a single polygon with attributes:
#'   - `name`: The polygon's label or language name.
#'   - `glottocode`: Unique identifier (e.g., from Glottolog).
#'   - `dataset`: Source dataset name(s).
#' @param params A list of styling parameters, including:
#'   - `color`: Fill color for the main rectangle.
#'
#' @return A `ggplot` object displaying the polygon’s name, glottocode,
#'   and data sources formatted within a white background panel.
#'
build_info_sheet <- function(poly, params) {
# Pull the (deduplicated) attributes off the polygon; geometry is dropped first
name <- poly |> st_drop_geometry() |> select(name) |> pull() |> unique()
glottocode <- poly |> st_drop_geometry() |> select(glottocode) |> pull() |> unique()
sources <- poly |> st_drop_geometry() |> select(dataset) |> pull() |> unique()
# Rectangle and main text position (all in [0, 1] plot coordinates)
x_rect <- 0.05
y_rect <- 0.5
rect_width <- 0.10
rect_height <- 0.10
x_text <- x_rect + rect_width + 0.02
y_text <- y_rect
gap <- 0.1
# Sources: place below the main text with spacing
spacing <- 0.1 # vertical spacing between items
# NOTE(review): df_sources is currently unused — the geom_text layer that
# consumed it is commented out below. Kept as-is so the sources list can be
# re-enabled by uncommenting; confirm whether it should be removed or restored.
df_sources <- data.frame(
x = rep(x_rect, length(sources) + 1),
y = y_rect - rect_height/2 - gap - seq(spacing, spacing * (length(sources) + 1), by = spacing),
label = c("Sources:", paste0(" - ", sources)),
fontface = c("bold", rep("plain", length(sources)))
)
ggplot() +
# main colored rectangle
annotate("rect",
xmin = x_rect,
xmax = x_rect + rect_width,
ymin = y_rect - rect_height/2,
ymax = y_rect + rect_height/2,
fill = params$color) +
# main text (name + glottocode)
annotate("text",
x = x_text,
y = y_text,
label = paste0(name, " (", glottocode, ")"),
size = 11,
fontface = "bold",
hjust = 0) +
# sources below (disabled; see NOTE above)
#geom_text(data = df_sources, aes(x = x, y = y, label = label, fontface = fontface),
#          hjust = 0, size = 8) +
coord_fixed() + # fixes aspect ratio to 1:1 (square)
xlim(0, 1) + ylim(0, 1) +
theme_void() +
theme(
plot.margin = margin(10, 10, 10, 10),
plot.background = element_rect(fill = "white", color = NA)
)
}
#' Draw a Line on an Image
#'
#' Opens a drawing context on an image and draws a line segment
#' between specified coordinates. Useful for adding separators or
#' borders in image montages.
#'
#' @param img An image object (from `magick::image_read()`).
#' @param x0, y0 Numeric. Starting coordinates of the line (pixels).
#' @param x1, y1 Numeric. Ending coordinates of the line (pixels).
#' @param color Character. Line color (e.g., `"black"`, `"#FFFFFF"`).
#' @param stroke Numeric. Line width in pixels (default: 10).
#'
#' @return A modified image object with the line drawn on it.
#'
#'
draw_line <- function(img, x0, y0, x1, y1, color, stroke = 10) {
# image_draw() opens a graphics device onto the image; base-graphics calls
# (segments, etc.) then draw directly onto the pixels until dev.off().
out <- image_draw(img) # open drawing context
# Draw the line
segments(
x0 = x0,
y0 = y0,
x1 = x1,
y1 = y1,
col = color,
lwd = stroke
)
# NOTE(review): if segments() errors, this dev.off() is never reached and
# the drawing device stays open — consider on.exit(dev.off()); verify against
# magick's image_draw semantics before changing.
dev.off() # close drawing context
out
}
#' Create an Image Montage with Separators
#'
#' Assembles multiple images into a structured montage grid with defined
#' margins and separator lines. Designed for creating visual summaries
#' or composite figures from example plots.
#'
#' @param image_paths Character vector of image file paths to include in the montage.
#' @param margin_w, margin_h Numeric. Horizontal and vertical margins (in pixels)
#'   between images.
#' @param cols, rows Integer. Number of columns and rows in the montage layout.
#' @param separator_width Numeric. Thickness (in pixels) of separator lines.
#' @param separator_color Character. Color of separators and outer border.
#'
#' @return A `magick-image` object representing the complete montage. The image
#'   is also written to `all_examples.png` within `plots_path` (a global that
#'   must be defined by the caller).
#'
#' @details
#' Combines images using `magick::image_montage()` and overlays separator lines
#' using [`draw_line()`]. Margins and separator dimensions can be tuned for
#' publication-quality figure arrangements.
#'
#' NOTE(review): the separator positions below hard-code `cell_w[4]`,
#' `cell_w[5]`, and rows `2:3`, i.e. they assume the specific 5-column,
#' 3+-row layout of the paper's example figure — the `cols`/`rows`
#' arguments do not fully generalise the line placement. The cell-size
#' computation also assumes all input images share one width and height
#' (`unique()` yields a single value) — confirm before reusing elsewhere.
#'
create_montage <- function(image_paths, margin_w, margin_h, cols, rows,
separator_width, separator_color){
images <- image_read(image_paths)
# Tile the images into a cols x rows grid with the given margins
montage <- image_montage(
image_join(images),
tile = paste0(cols, "x", rows),
geometry = paste0("+", margin_w,"+", margin_h),
bg = "white"
)
# Cell sizes excluding margins (assumes uniform image dimensions)
cell_w_no_margin <- rep(image_info(images)
|> select(width)
|> unique()
|> pull(), cols)
cell_h_no_margin <- image_info(images) |>
select(height) |>
unique() |>
pull()
# Cumulative pixel offsets of cell boundaries, including both margins
cell_w <- cumsum(cell_w_no_margin + 2 * margin_w)
cell_h <- cumsum(cell_h_no_margin + 2 * margin_h)
# Horizontal line under row 1, spanning the first four columns
montage <- draw_line(
montage,
x0 = 0,
y0 = cell_h[1],
x1 = cell_w [4],
y1 = cell_h[1],
color = separator_color,
stroke = separator_width
)
# Vertical line between col 4 and 5
montage <- draw_line(
montage,
x0 = cell_w[4],
y0 = 0,
x1 = cell_w[4],
y1 = cell_h[1],
color = separator_color,
stroke = separator_width
)
# Full-width horizontal lines under rows 2 and 3
for (row in 2:3) {
montage <- draw_line(
montage,
x0 = 0,
y0 = cell_h[row],
x1 = cell_w[5],
y1 = cell_h[row],
color = separator_color,
stroke = separator_width
)
}
# Add an outer border and write the result next to the other plots
image_write(image_border(montage,
color = separator_color,
geometry = paste0(separator_width, "x",
separator_width)),
path = file.path(plots_path,"all_examples.png"), format = "png",
density = 100)
return (montage)
}
#' Save multiple ggplot objects to files
#'
#' Writes each plot in a named list to disk, using the element's name as the
#' file name. Output size, resolution, and file format are configurable.
#'
#' @param plots Named list of ggplot objects. Each element should be a plot,
#'   and the name will be used as the file name.
#' @param base_folder Character. Base folder to save the plots.
#' @param width Numeric. The width of the saved plots in inches. Default is 15.
#' @param height Numeric. The height of the saved plots in inches. Default is 8.
#' @param dpi Numeric. The resolution (dots per inch) of the saved plots. Default is 300.
#' @param extension Character. File extension for the saved plots (e.g., ".pdf", ".png").
#'   Default is ".pdf".
#'
save_plots <- function(plots, base_folder, width = 15, height = 8, dpi = 300, extension = ".pdf") {
  # imap supplies each plot together with its list name (used as file name)
  imap(plots, function(plot_obj, plot_name) {
    ggsave(
      filename = file.path(base_folder, paste0(plot_name, extension)),
      plot = plot_obj,
      width = width,
      height = height,
      dpi = dpi
    )
  })
}
We load the following four datasets:
Glottography: all currently available data sets containing language polygons from Glottography.
# Discover all public Glottography data repositories on GitHub
glottography_repos <- get_public_repos("Glottography")
# Load every data set (features/languages/families) and project each level
# to Web Mercator (EPSG:3857)
glottography <- load_glottography(
glottography_handles = glottography_repos,
force_reload = FALSE) |>
map(\(x) x |>
st_transform(3857)) #|>
#filter(dataset != "asher2007world_traditional"))Glottolog: all languages with point coordinates and language families from Glottolog, including all their known members classified as languages. Glottolog is a comprehensive reference catalog of the world’s languages and their genealogical relationships (Hammarström et al. 2025).
# Load the Glottolog catalogue once and derive both subsets from it, instead
# of calling load_glottolog() twice.
glottolog_all <- load_glottolog()
# Languages with usable point coordinates, converted to sf points (WGS84)
# and reprojected to Web Mercator (EPSG:3857).
glottolog_languages <- glottolog_all |>
  filter(level == "language") |>
  filter(!is.na(latitude) & !is.na(longitude)) |>
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |>
  st_transform(3857)
# Top-level families: family rows with no parent in the classification.
glottolog_families <- glottolog_all |>
  filter(level == "family") |>
  filter(parent_id == "")Ethnologue: language polygons from Ethnologue’s World Mapping Service. Ethnologue is a subscription-based database providing information on the world’s languages, including their geographic distribution, number of speakers, and linguistic classification (Eberhard, Simons, and Fennig 2024).
# Ethnologue polygons (third-party licensed, not redistributable), reprojected
# to Web Mercator to match the other layers. `quiet = TRUE` suppresses the
# st_read() banner (spelled out: never use T/F for TRUE/FALSE).
ethnologue <- st_read("data/ethnologue/ethnologue_polygons.gpkg", quiet = TRUE) |>
  st_transform(3857)Natural Earth: land and ocean polygons from Natural Earth Data (South, Michael, and Massicotte 2025). These polygons provide geographic context and are used for visualising the language data.
# Natural Earth context layers used as map backdrops, all reprojected to
# Web Mercator (EPSG:3857) to match the language data.
ne_land <- load_ne("land") |>
st_transform(3857)
ne_ocean <- load_ne("ocean") |>
st_transform(3857)
ne_city <- load_ne("populated_places")|>
st_transform(3857)
ne_country <- load_ne("countries") |>
st_transform(3857)
ne_river <- load_ne("rivers_lake_centerlines") |>
st_transform(3857)
ne_lake <- load_ne("lakes") |>
st_transform(3857)This section presents several plots illustrating both the spatial and thematic coverage of the Glottography data. To begin, we create an empty hexagonal world grid. This grid serves as a spatial framework for visualising areal statistics, such as the number of unique languages per geographic region recorded in Glottography.
# Areas at the poles are excluded
no_poles <- get_no_pole_bbox(north = 90, south = -55)
# Land mask in the equal-area Equal Earth projection (EPSG:8857), clipped to
# the non-polar bounding box.
land_mask <- create_land_mask(ne_land, glottography$languages) |>
st_transform(8857) |> st_intersection(no_poles)
ocean_mask <- create_ocean_mask_eq_area(ocean_polygon = ne_ocean, 1000000)
# Create an equal area world grid
# NOTE(review): two statements are fused on the line below (grid creation and
# the n_features count) -- likely a notebook-flattening artifact; confirm
# against the original .qmd/.Rmd source.
world_grid <- create_world_grid_eq_area(500000,
mask = land_mask) n_features <- nrow(glottography$features)
n_languages <- nrow(glottography$languages)
n_families <- nrow(glottography$families)
# Distinct glottocode counts per layer; a glottocode can appear in several
# source datasets, so these are smaller than the raw row counts above.
n_features_distinct <- glottography$features |>
st_drop_geometry() |>
select(glottocode = cldf.languageReference) |>
distinct(glottocode) |>
nrow()
n_languages_distinct <- glottography$languages |>
st_drop_geometry() |>
select(glottocode = cldf.languageReference) |>
distinct(glottocode) |>
nrow()
n_families_distinct <- glottography$families |>
st_drop_geometry() |>
select(glottocode = cldf.languageReference) |>
distinct(glottocode) |>
nrow()Currently, Glottography offers 19606 feature polygons (7514 of which are distinct), 13119 language polygons of 5337 distinct languages, and 1387 family polygons of 394 distinct families.
To assess the spatial polygon density of the Glottography data, we intersect the hexagonal world grid with the language polygons from Glottography. For each grid cell, we compute the number of unique languages present. The resulting map in Figure S 1 illustrates the geographic distribution of languages in Glottography. It should not be interpreted as a map of language diversity, as some regions may be covered by many sources, while others are represented by only a few, which can skew the polygon density statistics.
# Unique (cell id, glottocode) pairs: which Glottography languages intersect
# which grid cells. The inner spatial join (left = FALSE, spelled out instead
# of the reassignable shorthand F) keeps only cells with at least one match.
glottography_per_cell <- st_join(world_grid,
                                 glottography$languages |> st_transform(8857) |>
                                   select(glottocode = cldf.languageReference),
                                 join = st_intersects, left = FALSE) |>
  distinct(id, glottocode)
# Per-cell language counts joined back onto the full grid; cells without any
# matching language get an explicit count of 0 instead of NA.
density_glottography <- world_grid |>
  left_join(glottography_per_cell |>
              group_by(id) |>
              tally(name = "n_glottography"),
            by = "id") |>
  mutate(n_glottography = ifelse(is.na(n_glottography),
                                 0, n_glottography))cell_size <- world_grid$geom |> st_area()|> set_units(km^2) |> mean()
# Human-readable cell size (rounded to the nearest 10,000 via round(x, -4))
# and a shared map extent reused by all world maps below.
cell_size_label <- paste0(format(round(as.numeric(cell_size), -4), big.mark = ",") , " km²")
map_limits <- c(-16243909, 16243909, -6585838, 8387503)
# Choropleth of unique Glottography polygons per grid cell, on a pseudo-log
# colour scale so that sparse and dense regions are both distinguishable.
density_plot <- ggplot() +
geom_sf(data = density_glottography, aes(fill = n_glottography), color = NA) +
# Mask oceans (white overlay)
geom_sf(data = ocean_mask, fill = "white", color = "white") +
# Continuous color scale for coverage
scale_fill_viridis_c(
option = "viridis",
name = "Unique Glottography\npolygons per cell\n",
breaks = c(0, 10, 40, max(density_glottography$n_glottography)),
labels = c("0", "10", "40", as.character(max(density_glottography$n_glottography))),
trans = pseudo_log_trans(sigma = 1)
) +
# Cell-size annotation placed in the southern ocean area of the map.
annotate("text",
x = 0, y = -5085838,
label = paste0("Cell size: ", cell_size_label),
hjust = 0, vjust = 1,
size = 5) +
coord_sf(
crs = st_crs(8857),
xlim = map_limits[1:2],
ylim = map_limits[3:4]
) +
theme_void() +
theme(
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
legend.title = element_text(size = 14, lineheight = 1.2),
legend.text = element_text(size = 12),
plot.caption = element_text(size = 14, hjust = 0.6),
legend.position = c(0.18, 0.25)
)density_plotNext, we compare the spatial coverage of Glottography with that of Ethnologue. We intersect the hexagonal grid with the language polygons from both datasets, count the number of unique languages in each grid cell, and compute the difference (Glottography minus Ethnologue) (see, Figure S 2)
# Same per-cell counting for Ethnologue (inner spatial join, left = FALSE
# spelled out instead of the reassignable shorthand F), then the per-cell
# difference: Glottography minus Ethnologue.
ethnologue_per_cell <- st_join(world_grid,
                               ethnologue |> st_transform(8857) |> select(-id),
                               join = st_intersects, left = FALSE) |>
  distinct(id, lang_iso)
glottography_ethnologue_comparison <- world_grid |>
  left_join(ethnologue_per_cell |>
              group_by(id) |>
              tally(name = "n_ethnologue"),
            by = "id") |>
  mutate(n_ethnologue = ifelse(is.na(n_ethnologue),
                               0, n_ethnologue)) |>
  left_join(glottography_per_cell |>
              group_by(id) |>
              tally(name = "n_glottography"),
            by = "id") |>
  mutate(n_glottography = ifelse(is.na(n_glottography),
                                 0, n_glottography)) |>
  mutate(diff = n_glottography - n_ethnologue)pal <- brewer.pal(11, "RdBu")
# Diverging map of the per-cell differences on an RdBu scale centred at 0,
# with a pseudo-log transform to keep small differences visible.
glottography_ethnologue_plot <- ggplot(glottography_ethnologue_comparison) +
geom_sf(aes(fill = diff), color = NA) +
# Coastline outline for geographic orientation.
geom_sf(data = ne_land, fill = NA, color = "grey") +
geom_sf(data = ocean_mask, fill = "white", color = "white") +
scale_fill_gradient2(
low = pal[1],
mid = "#FFFFFF",
high = pal[11],
midpoint = 0,
breaks = c(min(glottography_ethnologue_comparison$diff),
-10, 0, 10, max(glottography_ethnologue_comparison$diff)),
labels = c(as.character(min(glottography_ethnologue_comparison$diff)),
"-10", "0", "10", as.character(max(glottography_ethnologue_comparison$diff))),
trans = pseudo_log_trans(sigma = 1),
name = "Difference in unique \nlanguage polygons per cell\n\n(Glottography minus Ethnologue)\n"
) +
# Cell-size annotation in the southern ocean area of the map.
annotate("text",
x = 0, y = -5085838,
label = paste0("Cell size: ", cell_size_label),
hjust = 0, vjust = 1,
size = 5) +
coord_sf(
crs = st_crs(8857),
xlim = map_limits[1:2],
ylim = map_limits[3:4]
) +
theme_void() +
theme(
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
legend.title = element_text(size = 14, lineheight = 1.2),
legend.text = element_text(size = 12),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.caption = element_text(size = 14),
legend.position = c(0.18, 0.25)
) glottography_ethnologue_plotGlottolog provides point coordinates for most languages, so we need a different approach for comparison and cannot rely on the hexagonal grid. Our method is straightforward: we check which languages in Glottolog have a corresponding polygon in Glottography (see, Figure S 3).
# Flag each Glottolog language point by whether its glottocode has at least
# one polygon in Glottography (in_glottography = 1) or none (0).
glottolog_glottography <- glottolog_languages |>
select(glottocode = id) |>
rename(geom = geometry) |>
left_join(glottography$languages |>
select(glottocode = cldf.languageReference) |>
st_drop_geometry() |>
mutate(in_glottography = 1),
by = "glottocode") |>
mutate(in_glottography = if_else(is.na(in_glottography), 0L, 1L))|>
# A language can match several Glottography polygons; keep one row each.
distinct(glottocode, geom, in_glottography)
# Overall counts plus percentage labels used in the plot legend below.
glottolog_glottography_stats <- glottolog_glottography |>
st_drop_geometry() |>
summarise(
in_glottolog = n(),
in_glottography = sum(as.numeric(in_glottography)),
.groups = "drop") |>
mutate(label_out = paste0(round((in_glottolog - in_glottography)/in_glottolog * 100, 1), "%"),
label_in = paste0(round(in_glottography/in_glottolog * 100, 1), "%"))glottography_glottolog_plot <- ggplot(
glottolog_glottography |>
mutate(in_glottography = factor(in_glottography, levels = c(1, 0)))) +
# smaller, semi-transparent points
geom_sf(aes(color = factor(in_glottography)), size = 0.8, alpha = 0.6) +
# land and ocean layers
geom_sf(data = ne_land, fill = NA, color = "grey", linewidth = 1) +
geom_sf(data = ocean_mask, fill = "white", color = "white") +
# custom colors and labels
scale_color_manual(
values = c("1" = pal[10], "0" = pal[2]),
labels = c("1" = paste0("in Glottography (", glottolog_glottography_stats$label_in, ")"),
"0" = paste0("not in Glottography (", glottolog_glottography_stats$label_out, ")")),
name = "Glottolog language\n") +
# annotation
annotate("text",
x = 0, y = -5085838,
label = paste0("Total languages in Glottolog: ",
glottolog_glottography_stats$in_glottolog),
hjust = 0, vjust = 1,
size = 5) +
# projection and limits
coord_sf(
crs = st_crs(8857),
xlim = map_limits[1:2],
ylim = map_limits[3:4]
) +
# minimal theme
theme_void() +
theme(
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
legend.title = element_text(size = 14, lineheight = 1.2, hjust = 0), # align left
legend.text = element_text(size = 14),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.caption = element_text(size = 14),
legend.position = c(0.18, 0.25)
)glottography_glottolog_plotTo evaluate the completeness of Glottography per linguistic family, we calculate the proportion of languages represented in Glottography relative to the total number of known languages in that family as per Glottolog. This allows us to identify which families are well-covered and which have limited representation. Figure S 4 shows the 25 families with the most languages and their coverage in Glottography.
# Per-family totals: number of Glottolog languages and how many of them have
# at least one Glottography polygon. Non-genealogical pseudo-families
# ("Bookkeeping", "Sign Language") are excluded.
languages_per_family <- glottolog_families |>
select(family_name = name, family_glottocode = id) |>
left_join(glottolog_languages |>
rename(family_glottocode = family_id,
glottocode = id),
by = "family_glottocode") |>
left_join(glottography$languages |>
as.data.frame() |>
mutate(in_glottography = 1)|>
distinct(glottocode = cldf.languageReference, in_glottography)
, by = "glottocode") |>
mutate(in_glottography = replace_na(in_glottography, 0)) |>
group_by(family_glottocode, family_name) |>
summarise(n_languages = n(),
n_in_glottography = sum(in_glottography),
.groups = "drop") |>
filter(!family_name %in% c("Bookkeeping", "Sign Language")) |>
arrange(desc(n_languages)) families_top25_coverage_plot <- languages_per_family |>
head(25) |>
mutate(n_not_in_glottography = n_languages - n_in_glottography) |>
# Reshape to long form so covered/uncovered counts stack within each bar.
pivot_longer(
cols = c(n_in_glottography, n_not_in_glottography),
names_to = "type", values_to = "n"
) |>
mutate(type = factor(type, levels = c("n_not_in_glottography", "n_in_glottography"))) |>
ggplot(aes(x = reorder(family_name, -n_languages), y = n, fill = type)) +
geom_col() +
coord_flip() +
scale_fill_manual(
name = "Languages\n",
values = c("n_in_glottography" = pal[9],
"n_not_in_glottography" = pal[7]),
labels = c("n_in_glottography" = "in Glottography",
"n_not_in_glottography" = "not in Glottography"),
breaks = c("n_in_glottography", "n_not_in_glottography")) +
labs(x = "Family", y = "Number of languages") +
theme_minimal() +
theme(
legend.title = element_text(size = 14, lineheight = 1.2, hjust = 0), # align left
legend.text = element_text(size = 14),
legend.background = element_rect(fill = "white", color=NA ),
axis.text = element_text(size = 14),
axis.title = element_text(size = 14),
plot.caption = element_text(size = 14),
legend.position = c(0.8, 0.4)
)families_top25_coverage_plotNext, we compute the average number of sources across all languages (see, Figure S 5) and compare distinct sources in four selected languages (see, Figure S 6).
# One row per language: the list of datasets it appears in and how many
# source polygons (n_sources) it has across all Glottography datasets.
sources_per_language <- glottography$languages |>
  st_drop_geometry() |>
  rename(glottocode = cldf.languageReference, name = title) |>
  group_by(glottocode, name) |>
  summarise(
    datasets = list(dataset),
    n_sources = n(),
    .groups = "drop"
  )
# Mean number of sources per language (reported in the narrative text below).
# Use `<-` for assignment, not `=`, per R style conventions.
mean_sources_per_language <- sources_per_language |>
  summarise(mean_n_sources = mean(n_sources))
# Bar chart of how many languages have 1, 2, 3, ... sources.
sources_per_language_plot <- ggplot(sources_per_language, aes(x = factor(n_sources))) +
geom_bar(fill = pal[11]) +
labs(x = "Number of sources per language", y = "Count") +
theme_minimal() +
theme(
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16)
)sources_per_language_plotOn average, each language has 2.46 sources.
# Output folder for the example-language maps. A single dir.create() with
# showWarnings = FALSE replaces the check-then-create pattern: it is a no-op
# when the folder already exists and avoids the redundant existence test.
plots_path <- file.path("plots/language_examples")
dir.create(plots_path, recursive = TRUE, showWarnings = FALSE)
# Define languages and plotting parameters
# Each entry configures one example language, keyed by glottocode. The fields
# are consumed by define_map_extent(), build_language_map() and
# build_info_sheet() in the loop below: x_offset/y_offset feed
# define_map_extent(), and has_traditional controls whether the
# "asher2007world_traditional" dataset is plotted for that language.
glottocodes_for_map <- list(shon1251 = list(color = "#C8CFA3",
x_offset = 0.0,
y_offset = 0.12,
ncol_plot = 3,
city_scale_rank = 4,
nudge_x_country_name = 80000,
nudge_y_country_name = 30000,
remove_country_label = c("Botswana",
"Mozambique"),
has_traditional = FALSE),
beng1280 = list(color = "#C6B7D2",
x_offset = 0.02,
y_offset = 0.02,
ncol_plot = 3,
city_scale_rank = 3,
nudge_x_country_name = 20000,
nudge_y_country_name = 120000,
remove_country_label = "",
has_traditional = FALSE),
algo1255 = list(color = "#E2B8A1",
x_offset = 0.15,
y_offset = 0.15,
ncol_plot = 3,
city_scale_rank = 3,
nudge_x_country_name = 4800000,
nudge_y_country_name = -2400000,
remove_country_label = "",
has_traditional = TRUE),
bulg1262 = list(color = "#D9C7A6",
x_offset = 0.03,
y_offset = 0.10,
ncol_plot = 4,
city_scale_rank = 4,
nudge_x_country_name = 10000,
nudge_y_country_name = 20000,
remove_country_label = c("North Macedonia",
"Kosovo", "Serbia"),
has_traditional = FALSE))
# Render the example maps: one map per (language, source dataset) plus one
# info sheet per language, all written to `plots_path` via save_plots().
# File names are accumulated in `example_plots` for the montage step below.
# Fixes: initialise the accumulator as character() (vector() gives logical(0))
# and use `<-` instead of `=` for assignments inside the loop.
example_plots <- character()
for (i in seq_along(glottocodes_for_map)) {
  params <- glottocodes_for_map[[i]]
  current_glottocode <- names(glottocodes_for_map)[i]
  # All Glottography polygons for this language, across all source datasets.
  polygons <- glottography$languages |>
    rename(glottocode = cldf.languageReference, name = title) |>
    filter(glottocode == current_glottocode)
  datasets <- unique(polygons$dataset)
  map_extent <- define_map_extent(polygons,
                                  params$x_offset,
                                  params$y_offset)
  map_bbox <- st_bbox(map_extent)
  # Height/width aspect ratio of the map extent, used to size the output file.
  ratio <- as.numeric((map_bbox["ymax"] - map_bbox["ymin"]) /
                        (map_bbox["xmax"] - map_bbox["xmin"]))
  width_single_plot <- 7
  height_single_plot <- width_single_plot * ratio
  language_maps <- list()  # collected per language; not read again below
  for (dataset_name in datasets) {
    # For languages without a "traditional" Asher layer, skip that dataset and
    # label the "contemporary" layer with a trailing "*".
    if (!isTRUE(params$has_traditional) && dataset_name == "asher2007world_traditional") {
      next
    }
    if (!isTRUE(params$has_traditional) && dataset_name == "asher2007world_contemporary") {
      plot_name <- "asher2007world*"
    } else {
      plot_name <- dataset_name
    }
    polygon_single_source <- polygons |>
      filter(dataset == dataset_name)
    single_plot <- build_language_map(
      polygon_single_source,
      plot_name,
      params,
      map_extent)
    language_maps[[dataset_name]] <- single_plot
    save_plots(setNames(list(single_plot),
                        paste0(current_glottocode, "_", plot_name)),
               base_folder = plots_path,
               extension = ".png",
               width = width_single_plot,
               height = height_single_plot,
               dpi = 100)
    example_plots <- c(example_plots, paste0(current_glottocode, "_",
                                             plot_name, ".png"))
  }
  # One summary info sheet per language, saved alongside the per-source maps.
  info_sheet <- build_info_sheet(polygons,
                                 params)
  save_plots(setNames(list(info_sheet), current_glottocode),
             base_folder = plots_path,
             extension = ".png",
             width = width_single_plot,
             height = height_single_plot,
             dpi = 100)
  example_plots <- c(example_plots, paste0(current_glottocode, ".png"))
}
# Combine all plots
# Reorder the collected PNGs into the 5x4 montage grid.
# NOTE(review): plot_order hard-codes 20 positions; it must match the number
# and order of files accumulated in example_plots above -- revisit if the
# example languages or their datasets change.
plot_order <- c(1:4, 10, 5:9, 11:20)
all_example_maps <- create_montage(image_paths = file.path(plots_path,
example_plots[plot_order]),
margin_w = 10, margin_h = 40,
cols = 5, rows = 4, separator_color = "grey",
separator_width = 5)all_example_mapsFinally, we assess source coverage by counting the number of unique language polygons linked to each source (see, Figure S 7),
# Number of language polygons contributed by each source dataset, shown as a
# horizontal bar chart ordered by polygon count.
languages_per_source <- glottography$languages |>
st_drop_geometry() |>
group_by(dataset) |>
summarise(n_languages = n())languages_per_source_plot <- languages_per_source |>
ggplot(aes(x = reorder(dataset, -n_languages), y = n_languages)) +
geom_col() +
coord_flip() +
labs(
x = "Source",
y = "Number of language polygons"
) +
theme_minimal() +
theme(
legend.title = element_text(size = 14), # align left
legend.text = element_text(size = 14),
axis.text = element_text(size = 14),
axis.title = element_text(size = 14),
plot.caption = element_text(size = 14)
)languages_per_source_plot